[1] "2.34.1"
modelling workflow for risk and safety
prepared for the Health & Safety Executive, Science Division
13 Feb 2025
BUGS/JAGS to StanStan, which tunes hyperparameters during warmup!BUGS/JAGS to StanHoffman, M.D., Gelman, A., ‘The No-U-Turn Sampler’, JMLR, 2014
BUGS/JAGS to StanStanCon anyone?
‘democratising scalable UQ’ …once installed 🤔
looking for evidence of active corrosion growth
3×5 DataFrame
Row │ anomaly_id soil_type inspection measured_depth_mm T
│ Int64 String1 String3 Float64 Int64
─────┼─────────────────────────────────────────────────────────────
1 │ 1 A i_0 0.0672091 0
2 │ 2 A i_0 0.104202 0
3 │ 3 A i_0 0.100588 0
data {
int<lower=0> n_anomalies; // number of anomalies
int<lower=0> n_inspections; // number of inspections
vector[n_anomalies * n_inspections] cgr; // growth rate observations (2 per anomaly)
}
parameters {
real<lower=0> mu; // mean growth rate
real<lower=0> sigma; // standard deviation
}
model {
// model
for (i in 1 : (n_anomalies * n_inspections)) {
cgr[i] ~ normal(mu, sigma);
}
/*
//alternative (vectorised) implementation:
delta_C ~ normal(mu, sigma);
//some suggested priors
mu ~ normal(1/4, 3);
sigma ~ exponential(1);
*/
}
generated quantities {
vector[n_anomalies * n_inspections] cgr_pred;
//vector[n_anomalies * n_inspections] log_lik;
for (i in 1 : (n_anomalies * n_inspections)) {
cgr_pred[i] = normal_rng(mu, sigma);
//log_lik[i] = normal_lpdf(delta_C[i] | mu, sigma);
}
}
INFO:cmdstanpy:found newer exe file, not recompiling
data {
int <lower = 0> n_anomalies; // number of anomalies
int <lower = 0> n_inspections; // number of inspections
vector[n_anomalies * n_inspections] cgr; // growth rate observations (2 per anomaly)
}
parameters {
real<lower = 0> mu; // mean growth rate
real<lower = 0> sigma; // standard deviation
}
model {
// model
for(i in 1:(n_anomalies * n_inspections)){
cgr[i] ~ normal(mu, sigma);
}
/*
//alternative (vectorised) implementation:
delta_C ~ normal(mu, sigma);
//some suggested priors
mu ~ normal(1/4, 3);
sigma ~ exponential(1);
*/
}
generated quantities {
vector[n_anomalies * n_inspections] cgr_pred;
//vector[n_anomalies * n_inspections] log_lik;
for (i in 1:(n_anomalies * n_inspections)) {
cgr_pred[i] = normal_rng(mu, sigma);
//log_lik[i] = normal_lpdf(delta_C[i] | mu, sigma);
}
}
CmdStanR needs it’s input data as a list
prepare_data <- function(df = corrosion_data) {
df |>
arrange(anomaly_id, T) |>
group_by(anomaly_id) |>
mutate(
next_depth = lead(measured_depth_mm),
time_diff = lead(T) - T
) |>
filter(!is.na(next_depth)) |>
mutate(
delta_C = (next_depth - measured_depth_mm) / time_diff
) |>
select(anomaly_id, delta_C, soil_type) |>
ungroup()
}
model_data <- list(
n_anomalies = prepare_data()$anomaly_id |> unique() |> length(),
n_inspections = 2,
cgr = prepare_data()$delta_C
)
cgr_post <- cgr_model$sample(data = model_data)Running MCMC with 4 sequential chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 0.1 seconds.
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 0.1 seconds.
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 0.1 seconds.
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.6 seconds.
CmdStanPy needs it’s input data as a dictionary
def prepare_data(df = corrosion_data):
return (
df.sort(['anomaly_id', 'T'])
.group_by('anomaly_id')
.agg([
pl.col('measured_depth_mm').shift(-1).alias('next_depth'),
pl.col('T').shift(-1).alias('next_time'),
pl.col('measured_depth_mm'),
pl.col('T')
])
.filter(pl.col('next_depth').is_not_null())
.with_columns([
((pl.col('next_depth') - pl.col('measured_depth_mm')) /
(pl.col('next_time') - pl.col('T'))).alias('delta_C')
])
.select(['anomaly_id', 'delta_C'])
.explode('delta_C') # Add this line to unnest the lists
.filter(pl.col('delta_C').is_not_null()) # Optional: remove null values if any
)
model_data = {
'n_anomalies': prepare_data().select('anomaly_id').unique().height,
'n_inspections': 2,
'cgr': prepare_data().select('delta_C').to_series().to_numpy()
}
cgr_post = cgr_model.sample(data = model_data)
INFO:cmdstanpy:CmdStan start processing
chain 1 |[33m [0m| 00:00 Status
chain 2 |[33m [0m| 00:00 Status[A
chain 3 |[33m [0m| 00:00 Status[A[A
chain 4 |[33m [0m| 00:00 Status[A[A[A
chain 2 |[34m######### [0m| 00:00 Iteration: 1700 / 2000 [ 85%] (Sampling)[A
chain 1 |[34m#########5[0m| 00:00 Iteration: 1800 / 2000 [ 90%] (Sampling)
chain 3 |[34m########1 [0m| 00:00 Iteration: 1500 / 2000 [ 75%] (Sampling)[A[A
chain 4 |[34m#########5[0m| 00:00 Iteration: 1800 / 2000 [ 90%] (Sampling)[A[A[A
chain 1 |[34m##########[0m| 00:00 Sampling completed
chain 2 |[34m##########[0m| 00:00 Sampling completed
chain 3 |[34m##########[0m| 00:00 Sampling completed
chain 4 |[34m##########[0m| 00:00 Sampling completed
INFO:cmdstanpy:CmdStan done processing.
A Turing model needs it’s input data as arguments to the model function
function prepare_data(df::DataFrame = corrosion_data)
sorted_df = sort(df, [:anomaly_id, :T]); result = DataFrame()
for group in groupby(sorted_df, :anomaly_id)
if nrow(group) > 1
for i in 1:(nrow(group)-1)
Δc = (group[i+1, :measured_depth_mm] - group[i, :measured_depth_mm]) / (group[i+1, :T] - group[i, :T])
push!(result, (
anomaly_id = group[i, :anomaly_id], Δc = max(0, Δc)
))
end
end
end
return result
end
cgr_post = prepare_data().Δc |>
data -> corrosion_growth(data) |>
model -> sample(model, NUTS(), 1_000)# A draws_df: 1000 iterations, 4 chains, and 53 variables
lp__ mu sigma cgr_pred[1] cgr_pred[2] cgr_pred[3] cgr_pred[4] cgr_pred[5]
1 130 0.083 0.046 0.013 0.085 0.070 0.097 0.114
2 129 0.087 0.046 0.083 0.100 0.039 0.054 0.051
3 130 0.079 0.041 0.064 0.041 -0.021 0.124 0.130
4 130 0.081 0.043 0.029 0.089 0.093 0.135 0.095
5 131 0.083 0.041 0.074 0.062 0.060 0.069 0.044
6 130 0.082 0.038 0.108 -0.016 0.111 0.064 0.094
7 130 0.079 0.043 0.087 0.049 -0.019 0.057 0.117
8 128 0.095 0.039 0.086 0.176 0.110 0.089 0.102
9 128 0.069 0.044 0.047 0.056 -0.011 0.127 0.088
10 129 0.071 0.040 0.090 0.059 0.027 0.109 0.035
# ... with 3990 more draws, and 45 more variables
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
# A tibble: 8,000 × 4
Parameter Chain Iteration value
<chr> <int> <int> <dbl>
1 mu 1 1 0.0830
2 mu 1 2 0.0875
3 mu 1 3 0.0792
4 mu 1 4 0.0807
5 mu 1 5 0.0827
6 mu 1 6 0.0824
7 mu 1 7 0.0793
8 mu 1 8 0.0948
9 mu 1 9 0.0690
10 mu 1 10 0.0712
# ℹ 7,990 more rows
array([[[ 1.30511e+02, 9.75941e-01, 7.47178e-01, ..., 2.85270e-03,
1.01211e-01, 1.14241e-01],
[ 1.26794e+02, 8.16985e-01, 7.13934e-01, ..., 1.10709e-01,
3.40024e-02, 8.85645e-02],
[ 1.30074e+02, 9.94407e-01, 6.84990e-01, ..., 7.31923e-02,
1.11121e-01, 6.49327e-02],
[ 1.29947e+02, 9.83667e-01, 7.72327e-01, ..., 3.66501e-02,
7.89994e-02, 1.18494e-01]],
[[ 1.30388e+02, 9.89454e-01, 7.47178e-01, ..., 8.69072e-02,
3.37897e-02, 9.25661e-02],
[ 1.28357e+02, 1.00000e+00, 7.13934e-01, ..., 1.03536e-01,
8.78874e-02, 7.51175e-02],
[ 1.29825e+02, 9.89659e-01, 6.84990e-01, ..., 1.45836e-01,
1.17892e-01, 9.41959e-02],
[ 1.30160e+02, 9.85570e-01, 7.72327e-01, ..., 5.90831e-02,
8.81251e-02, 6.52294e-02]],
[[ 1.30241e+02, 9.88424e-01, 7.47178e-01, ..., 2.16155e-02,
7.24129e-02, 1.02635e-01],
[ 1.28479e+02, 1.00000e+00, 7.13934e-01, ..., 9.62566e-02,
4.77785e-02, 1.24287e-01],
[ 1.30376e+02, 1.00000e+00, 6.84990e-01, ..., 9.49858e-02,
8.78894e-02, 3.76091e-02],
[ 1.30277e+02, 9.92040e-01, 7.72327e-01, ..., -5.04970e-03,
8.21757e-02, 7.99845e-02]],
...,
[[ 1.29980e+02, 9.92327e-01, 7.47178e-01, ..., 9.34758e-02,
8.81039e-02, 8.87766e-02],
[ 1.27094e+02, 8.98094e-01, 7.13934e-01, ..., 1.67347e-01,
7.64573e-02, 1.55398e-01],
[ 1.30398e+02, 9.36709e-01, 6.84990e-01, ..., 7.45936e-02,
9.61824e-02, 1.78366e-01],
[ 1.30322e+02, 9.36647e-01, 7.72327e-01, ..., 9.29926e-02,
1.03869e-01, 1.98546e-02]],
[[ 1.29567e+02, 9.44852e-01, 7.47178e-01, ..., 7.06839e-02,
9.29076e-02, 7.62933e-02],
[ 1.29801e+02, 9.69785e-01, 7.13934e-01, ..., 5.52811e-02,
1.21212e-01, 2.58968e-02],
[ 1.29925e+02, 9.67583e-01, 6.84990e-01, ..., 9.35320e-02,
7.62850e-02, 2.78032e-02],
[ 1.30498e+02, 1.00000e+00, 7.72327e-01, ..., 1.28179e-01,
9.27318e-02, 7.85656e-02]],
[[ 1.29641e+02, 9.26465e-01, 7.47178e-01, ..., 2.57477e-02,
4.39165e-02, 1.69162e-01],
[ 1.29784e+02, 8.43336e-01, 7.13934e-01, ..., 6.84627e-02,
1.58820e-01, 1.42270e-01],
[ 1.30276e+02, 9.76230e-01, 6.84990e-01, ..., 6.78183e-02,
7.87947e-02, 8.21316e-02],
[ 1.30363e+02, 9.83632e-01, 7.72327e-01, ..., 1.22037e-01,
-2.18041e-03, 8.66952e-02]]])
lp__ accept_stat__ ... cgr_pred[49] cgr_pred[50]
0 130.511 0.975941 ... 0.101211 0.114241
1 130.388 0.989454 ... 0.033790 0.092566
2 130.241 0.988424 ... 0.072413 0.102635
3 128.701 0.882168 ... 0.075803 0.041847
4 129.255 1.000000 ... 0.128673 0.084683
... ... ... ... ... ...
3995 129.794 1.000000 ... 0.037539 0.086167
3996 130.408 1.000000 ... 0.074031 0.071042
3997 130.322 0.936647 ... 0.103869 0.019855
3998 130.498 1.000000 ... 0.092732 0.078566
3999 130.363 0.983632 ... -0.002180 0.086695
[4000 rows x 59 columns]
1000×16 DataFrame
Row │ iteration chain μ σ lp n_steps is_accept a ⋯
│ Int64 Int64 Float64 Float64 Float64 Float64 Float64 F ⋯
──────┼─────────────────────────────────────────────────────────────────────────
1 │ 501 1 0.0885972 0.036928 84.5361 3.0 1.0 ⋯
2 │ 502 1 0.0940632 0.0379282 82.8881 1.0 1.0
3 │ 503 1 0.0901949 0.0350495 83.3674 3.0 1.0
4 │ 504 1 0.0847086 0.0315686 82.4833 3.0 1.0
5 │ 505 1 0.0782442 0.0368907 85.4057 3.0 1.0 ⋯
6 │ 506 1 0.081535 0.0384244 85.8435 3.0 1.0
7 │ 507 1 0.0781813 0.0500135 85.2838 3.0 1.0
8 │ 508 1 0.0802804 0.0412565 86.1106 3.0 1.0
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
994 │ 1494 1 0.0859129 0.0408179 85.6162 7.0 1.0 ⋯
995 │ 1495 1 0.0833438 0.0437737 85.8445 3.0 1.0
996 │ 1496 1 0.0812184 0.0364029 85.3886 3.0 1.0
997 │ 1497 1 0.0812184 0.0364029 85.3886 1.0 1.0
998 │ 1498 1 0.0824591 0.0375846 85.6518 1.0 1.0 ⋯
999 │ 1499 1 0.0648397 0.0572539 83.7585 7.0 1.0
1000 │ 1500 1 0.0717195 0.0492112 85.284 3.0 1.0
9 columns and 985 rows omitted
success
next up: how we can extend this to a robust and helpful workflow
☕
Turing.jln_draws = 1_000; n_chains = 4
# a no U-turn sampler, with 2000 adaptive steps and a target acceptance rate of 0.65
NUTS_sampler = NUTS(2_000, 0.65)
# a Hamiltonian Monte Carlo sampler, with a step size of 0.05 and 10 leapfrog steps
HMC_sampler = HMC(0.05, 10)
# a Metropolis-Hastings sampler, using the default proposal distribution (priors)
MH_sampler = MH()
# a 'compositional' Gibbs sampler (Metropolis within Gibbs) - sampling μ with MH and σ with NUTS
Gibbs_sampler = Gibbs(MH(:μ), NUTS(2_000, 0.65, :σ))
run_mcmc = function(sampler)
return prepare_data().Δc |>
data -> corrosion_growth(data) |>
model -> sample(model, sampler, MCMCThreads(), n_draws, n_chains)
end